#-------------------------Package Installer--------------------------
# load packages and install if missing
# thanks to Richard Schwinn for the original code, http://stackoverflow.com/a/33876492

# Character vector of every package this analysis depends on; consumed by
# loadpacks() below and removed from the workspace once loading is done.
p <- c('data.table', 'ggplot2', 'sf', 'psych', 'reshape2', 'tidyverse',
       'dplyr', 'corrgram', 'spdep', 'mapview', 'tmap', 'ggpubr', 'spatialreg')

# this is a package loading function
# Install any packages from `package.list` that are not yet present, then
# attach every package in the list.
#
# package.list : character vector of package names (defaults to `p` above).
# Returns a list of logicals, one per package -- TRUE where attaching
# succeeded -- because require() reports failure instead of erroring.
loadpacks <- function(package.list = p){
  # Packages requested but missing from the installed library.
  new.packages <- package.list[!(package.list %in% installed.packages()[,'Package'])]
  if(length(new.packages) > 0) {
    # Fix: the original hard-coded type = "binary", which is only supported on
    # Windows/macOS and fails on Linux; getOption("pkgType") yields the
    # correct default for the current platform.
    install.packages(new.packages, Ncpus = parallel::detectCores(),
                     type = getOption("pkgType"), repos = "https://cran.rstudio.com")
  }
  # The original wrapped package.list in a no-op eval(); dropped.
  lapply(package.list, require, character.only = TRUE)
}

loadpacks(p) # calling function to load and/or install packages
rm(loadpacks, p) # cleanup namespace: neither object is needed after loading
#----------------------End of Package Installer----------------------

#------------------------------Options-------------------------------

# Let data.table use every available CPU core for its parallel operations.
data.table::setDTthreads(threads = parallel::detectCores())
# Strongly penalise scientific notation so income figures print in full.
options(scipen = 999)

#---------------------------End of Options---------------------------

# Placeholder entry point left over from the script template: the body is
# empty and the function is removed immediately below, so it never runs.
main <- function() {
    
    
    
}

rm(main)

1 Code book

You will be using a number of datasets in this lab.

1.1 income_est

This dataset is a summary of income estimates (in British pounds) for 2002-2013 by London borough. The variables are as follows:

  • Code = unique area code
  • Borough = London borough
  • the rest of the columns are dates = Total Median Annual Household Income estimate for each year

1.2 qualifications

This dataset is an extract from the full qualifications dataset - a few variables are selected from the full dataset and the year is 2011. The variables are as follows:

  • Code = unique area code
  • Borough = London borough
  • share_qualified = share of highly qualified people of working age (16-64)
  • share_unqualified = share of unqualified people of working age (16-64)

2 Load data

2.1 Load data from income_est file from data subfolder into a data.table or tibble object with a name of your choice.

You might need to tell the function you are using to read the csv file that you have column names in the first row.

income <- fread("data/income_est.csv", header=TRUE)

2.2 Load qualifications file from data subfolder into a data.table or tibble object with a name of your choice.

You might need to tell the function you are using to read the csv file that you have column names in the first row. For fread() it would be a header = TRUE parameter.

qual <- fread("data/qualifications.csv", header=TRUE)

3 Get to know the datasets

3.1 Check the structure of the dataset with data from the income_est file

str(income)
## Classes 'data.table' and 'data.frame':   33 obs. of  14 variables:
##  $ Code   : chr  "E09000001" "E09000002" "E09000003" "E09000004" ...
##  $ Borough: chr  "City of London" "Barking and Dagenham" "Barnet" "Bexley" ...
##  $ 2002   : int  42720 20120 29480 25490 24280 29550 31300 26450 27160 25100 ...
##  $ 2003   : int  43620 20410 29780 25860 24400 30040 31710 26780 27350 25240 ...
##  $ 2004   : int  45150 21020 30520 26620 24900 30980 32580 27500 27940 25750 ...
##  $ 2005   : int  48070 22300 32200 28220 26160 32880 34450 29070 29380 27050 ...
##  $ 2006   : int  48380 22390 32150 28310 26010 33010 34460 29080 29240 26900 ...
##  $ 2007   : int  51410 23750 33900 30010 27340 35000 36400 30720 30740 28260 ...
##  $ 2008   : int  53110 24520 34790 30950 27960 36110 37410 31570 31450 28900 ...
##  $ 2009   : int  54960 25360 35770 31990 28660 37310 38510 32510 32230 29610 ...
##  $ 2010   : int  57520 26550 37220 33460 29730 39020 40110 33870 33430 30700 ...
##  $ 2011   : int  59240 27360 38120 34440 30370 40150 41120 34730 34140 31340 ...
##  $ 2012   : int  62290 28780 39880 36210 31690 42180 43030 36370 35600 32680 ...
##  $ 2013   : int  63620 29420 40530 36990 32140 43060 43750 37000 36070 33110 ...
##  - attr(*, ".internal.selfref")=<externalptr>

3.2 Show first and last three lines of the dataset with data from the income_est file

headTail(income, top=3, bottom=3)
##        Code              Borough X2002 X2003 X2004 X2005 X2006 X2007 X2008
## 1 E09000001       City of London 42720 43620 45150 48070 48380 51410 53110
## 2 E09000002 Barking and Dagenham 20120 20410 21020 22300 22390 23750 24520
## 3 E09000003               Barnet 29480 29780 30520 32200 32150 33900 34790
## 4      <NA>                 <NA>   ...   ...   ...   ...   ...   ...   ...
## 5 E09000031       Waltham Forest 23000 23320 23990 25420 25490 27000 27820
## 6 E09000032           Wandsworth 32800 33340 34370 36450 36580 38760 39960
## 7 E09000033          Westminster 33190 33740 34770 36880 36980 39150 40320
##   X2009 X2010 X2011 X2012 X2013
## 1 54960 57520 59240 62290 63620
## 2 25360 26550 27360 28780 29420
## 3 35770 37220 38120 39880 40530
## 4   ...   ...   ...   ...   ...
## 5 28730 30020 30870 32420 33080
## 6 41260 43110 44330 46550 47480
## 7 41580 43380 44530 46670 47510

3.3 Check the structure of the dataset with data from the qualifications file

str(qual)
## Classes 'data.table' and 'data.frame':   32 obs. of  4 variables:
##  $ Code             : chr  "E09000002" "E09000003" "E09000004" "E09000005" ...
##  $ Borough          : chr  "Barking and Dagenham" "Barnet" "Bexley" "Brent" ...
##  $ share_qualified  : num  26.9 50.4 27.2 30.2 43.5 55.2 36.7 47.1 36.8 43.2 ...
##  $ share_unqualified: num  13.9 6.1 8.6 11.2 7.5 8.5 8.9 11.1 8.7 10.7 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# no record of city of London

3.4 Show first and last three lines of the dataset with data from the qual file

headTail(qual, top=3, bottom=3)
##        Code              Borough share_qualified share_unqualified
## 1 E09000002 Barking and Dagenham            26.9              13.9
## 2 E09000003               Barnet            50.4               6.1
## 3 E09000004               Bexley            27.2               8.6
## 4      <NA>                 <NA>             ...               ...
## 5 E09000031       Waltham Forest            38.4              11.3
## 6 E09000032           Wandsworth            64.5               5.6
## 7 E09000033          Westminster            60.1               6.8

4 Transform and explore data

4.1 Transform data from the income_est file

Transform the data from the income_est file so that it has the following columns:

  • Code = unique area code
  • Borough = London borough
  • Year = year
  • Income = median income in particular year

You may name the variables as you like.

Save the result to an object, give it any name you like.

income_molten <- melt(income, id.vars=c('Code', 'Borough'), 
                      variable.name='Year', value.name = 'income_city')

income_by_year <- income_molten %>% group_by(Year) %>% summarise( MedianIncomeYear = median(income_city) )

https://stackoverflow.com/questions/54366592/r-how-to-take-median-of-rows-in-dataframe

income_new <- merge(income_molten, income_by_year,
                        by.x='Year', by.y='Year')

names(income_new)[5] <- 'Income'
names(income_new)[4] <- 'Income_est'
#income_new <- income_new[-c(4)]

4.3 Summarize data

Using the dataset you created in the previous step, print top 5 boroughs by max income.

income_new_copy <- income_new %>% group_by(Borough) %>% summarise( MedianIncomeBorough = median(Income_est) )

sorted <- arrange(income_new_copy, desc(MedianIncomeBorough))

sorted[1:5,]
## # A tibble: 5 x 2
##   Borough                MedianIncomeBorough
##   <chr>                                <dbl>
## 1 City of London                       52260
## 2 Kensington and Chelsea               45480
## 3 Richmond upon Thames                 43950
## 4 Westminster                          39735
## 5 Wandsworth                           39360

Using the dataset you created in the previous step, print only 6th to 10th top boroughs by income.

sorted[6:10,]
## # A tibble: 5 x 2
##   Borough                MedianIncomeBorough
##   <chr>                                <dbl>
## 1 Kingston upon Thames                 36965
## 2 Camden                               36905
## 3 Hammersmith and Fulham               36520
## 4 Bromley                              35555
## 5 Merton                               34975

4.4 Plot top 10 boroughs

Create a bar plot comparing the top 10 boroughs by income.

Hint 1: use stat = 'identity' parameter for bar plot geom. Hint 2: use + coord_flip() with your plot for better readability.

https://stackoverflow.com/questions/59008974/why-is-stat-identity-necessary-in-geom-bar-in-ggplot

sorted[1:10,] %>% ggplot(aes(Borough, MedianIncomeBorough)) +
  geom_bar(stat = "identity") +
  coord_flip()

5 Enrich the data

Select only one year's worth of data from income_est (choose the year that corresponds to the year of the qualifications data) and save it as a new data.table or tibble. You should keep the Code, the Borough name and the income estimate column.

The year 2011 is chosen, since it matches the year of the qualifications data.

income_2011 <- filter(income_new, Year == 2011)
head(income_2011)
##   Year      Code              Borough Income_est Income
## 1 2011 E09000001       City of London      59240  34730
## 2 2011 E09000002 Barking and Dagenham      27360  34730
## 3 2011 E09000003               Barnet      38120  34730
## 4 2011 E09000004               Bexley      34440  34730
## 5 2011 E09000005                Brent      30370  34730
## 6 2011 E09000006              Bromley      40150  34730

Merge your new income estimate table with the qualifications table and save the result as a new table (you should get a 32x5 table with Code, Borough, share qualified, share unqualified and estimated income columns, all variables should be for the same year or within 1 year):

data <- merge(income_2011, qual, by.x='Code', by.y='Code', all.x=TRUE)
names(data)[3] <- 'Borough'
data <- data %>% select(!c(Borough.y, Year, Income))
head(data)
##        Code              Borough Income_est share_qualified share_unqualified
## 1 E09000001       City of London      59240              NA                NA
## 2 E09000002 Barking and Dagenham      27360            26.9              13.9
## 3 E09000003               Barnet      38120            50.4               6.1
## 4 E09000004               Bexley      34440            27.2               8.6
## 5 E09000005                Brent      30370            30.2              11.2
## 6 E09000006              Bromley      40150            43.5               7.5

Create a scatter plot for each pair of variables in the new table Or you can create a corrgram for all variables at once if you recall how to do it from lab 6.

# Lower-triangle panel function for corrgram(): shades each cell by the
# Pearson correlation of (x, y) and prints the coefficient together with
# significance stars from a two-sided correlation test.
#
# x, y        : numeric vectors for the current variable pair
# corr        : ignored on input; recomputed with pairwise-complete obs
# col.regions : palette-generating function supplied by corrgram()
# ...         : further panel arguments (unused)
panel.shadeNtext <- function (x, y, corr = NULL, col.regions, ...) 
{
      corr <- cor(x, y, use = "pair")
      # p-value of the correlation test drives the star annotation
      # (the original stored this in a variable misleadingly named `est`)
      p_val <- cor.test(x, y, alternative = "two.sided")$p.value
      stars <- ifelse(p_val < 5e-4, "***", 
                      ifelse(p_val < 5e-3, "**", 
                             ifelse(p_val < 5e-2, "*", "")))
      # renamed from `ncol` to avoid shadowing base::ncol
      n_colors <- 14
      pal <- col.regions(n_colors)
      # map corr in [-1, 1] onto one of the n_colors palette bins
      col.ind <- as.numeric(cut(corr, breaks = seq(from = -1, to = 1, 
                                                   length = n_colors + 1), include.lowest = TRUE))
      usr <- par("usr")
      rect(usr[1], usr[3], usr[2], usr[4], col = pal[col.ind], 
           border = NA)
      box(col = "lightgray")
      # Fix: restore the coordinate system on exit with the NAMED form.
      # The original `on.exit(par(usr))` passes an unnamed vector, which makes
      # par() QUERY parameters instead of resetting them -- a silent no-op.
      on.exit(par(usr = usr), add = TRUE)
      par(usr = c(0, 1, 0, 1))
      r <- formatC(corr, digits = 2, format = "f")
      cex.cor <- .8/strwidth("-X.xx")
      fonts <- ifelse(stars != "", 2, 1)  # bold face for significant cells (option 2)
      # option 1: stars:
      text(0.5, 0.4, paste0(r,"\n", stars), cex = cex.cor)
      # option 2: bolding:
      #text(0.5, 0.5, r, cex = cex.cor, font=fonts)
}
corrgram(data, upper.panel = panel.pts, lower.panel = panel.shadeNtext, diag.panel = panel.density)

As expected, a higher proportion of skilled workers is significantly positively associated with neighborhood income, while a higher proportion of unskilled workers is significantly negatively associated with income.

Test all variables for normality. Do you need to log-transform any variables? You may choose to answer that using plots or empirically (by trying to build models).

hist(data$Income_est, breaks=33)

shapiro.test(data$Income_est)
## 
##  Shapiro-Wilk normality test
## 
## data:  data$Income_est
## W = 0.89103, p-value = 0.003143

The income variable has to be transformed.

hist(data$share_qualified, breaks=33)

shapiro.test(data$share_qualified)
## 
##  Shapiro-Wilk normality test
## 
## data:  data$share_qualified
## W = 0.97594, p-value = 0.6759
hist(data$share_unqualified, breaks=33)

shapiro.test(data$share_unqualified)
## 
##  Shapiro-Wilk normality test
## 
## data:  data$share_unqualified
## W = 0.97358, p-value = 0.6034

The p-value in the Shapiro test is greater than the significance level (0.05), so we have no evidence of significant departure from the normal distribution. You can also see on the correlogram plots that the distribution is close to normal.

__

Build a multiple linear regression model. Try to predict income by the shares of qualified and unqualified workforce.

m.1 <- lm(log(Income_est) ~ share_qualified + share_unqualified, data=data)

Print model summary:

summary(m.1)
## 
## Call:
## lm(formula = log(Income_est) ~ share_qualified + share_unqualified, 
##     data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.16413 -0.04821 -0.01657  0.04850  0.21646 
## 
## Coefficients:
##                    Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)       10.540363   0.110571  95.327 < 0.0000000000000002 ***
## share_qualified    0.005698   0.001570   3.628              0.00109 ** 
## share_unqualified -0.033263   0.006566  -5.066            0.0000211 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08888 on 29 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6779, Adjusted R-squared:  0.6557 
## F-statistic: 30.52 on 2 and 29 DF,  p-value: 0.00000007327

Interpret the model by analysing the residuals and inspecting the key model summary coefficients.

ggResidpanel::resid_panel(m.1)

The residuals do not depend on the predicted values, and we can also see from the QQ-plot that the range of values is close to the line. This tells us that the model has sufficient predictive power.

Apart from code, provide a brief description of how you can interpret the results.

It can be noticed that both coefficients are significant. Like in the correlations, the increase in the share of qualified workers increases district income on average, ceteris paribus. In contrast, the increase in the share of unqualified labor workers decreases district income on average, ceteris paribus.

Do you think the model may suffer from spatial autocorrelation? You will be able to explore this later in the lab.

6 Working with spatial data

6.1 Load spatial data

Load london_boroughs spatial data from the data subfolder into an object with any name. Make sure it has the same number of boroughs as your table from the previous part of the analysis.

st_layers('data/london_boroughs.gpkg')
## Driver: GPKG 
## Available layers:
##   layer_name geometry_type features fields
## 1  london_22 Multi Polygon       15      7
## 2  london_33 Multi Polygon       20      7
## 3  london_24 Multi Polygon       33      7
## 4 london_112 Multi Polygon       29      7
## 5  london_52 Multi Polygon        3      7
london <- st_read('data/london_boroughs.gpkg', layer = 'london_24')
## Reading layer `london_24' from data source 
##   `C:\Users\elena\Desktop\Studies and work\Urban_Programming\lab_exam\London spatial\data\london_boroughs.gpkg' 
##   using driver `GPKG'
## Simple feature collection with 33 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 672920.2 ymin: 5685642 xmax: 731187.7 ymax: 5730735
## Projected CRS: WGS 84 / UTM zone 30N

6.2 Plot a map of boroughs boundaries you just loaded

london %>% st_geometry() %>% plot(lwd = 0.7)

mapview(london)

6.3 Check the spatial object’s data

6.3.1 Check the variable names of the spatial data set

names(london)
## [1] "NAME"       "GSS_CODE"   "HECTARES"   "NONLD_AREA" "ONS_INNER" 
## [6] "SUB_2009"   "SUB_2006"   "geom"

6.3.2 Check the structure of the spatial data set

str(london)
## Classes 'sf' and 'data.frame':   33 obs. of  8 variables:
##  $ NAME      : chr  "Kingston upon Thames" "Croydon" "Bromley" "Hounslow" ...
##  $ GSS_CODE  : chr  "E09000021" "E09000008" "E09000006" "E09000018" ...
##  $ HECTARES  : num  3726 8649 15013 5659 5554 ...
##  $ NONLD_AREA: num  0 0 0 60.8 0 ...
##  $ ONS_INNER : chr  "F" "F" "F" "F" ...
##  $ SUB_2009  : chr  NA NA NA NA ...
##  $ SUB_2006  : chr  NA NA NA NA ...
##  $ geom      :sfc_MULTIPOLYGON of length 33; first list element: List of 1
##   ..$ :List of 1
##   .. ..$ : num [1:1271, 1:2] 685963 685968 685974 685981 685989 ...
##   ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
##  - attr(*, "sf_column")= chr "geom"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA
##   ..- attr(*, "names")= chr [1:7] "NAME" "GSS_CODE" "HECTARES" "NONLD_AREA" ...

6.4 Subset your molten dataset

6.5 Merge qulification data with spatial data

Merge/join your table (the one that you used for modelling above) to the spatial object. Show the structure of the resulting spatial object.

spatial_dat <- merge(london, data, by.x = "GSS_CODE", by.y = 'Code', all.x=TRUE)
str(spatial_dat)
## Classes 'sf' and 'data.frame':   33 obs. of  12 variables:
##  $ GSS_CODE         : chr  "E09000001" "E09000002" "E09000003" "E09000004" ...
##  $ NAME             : chr  "City of London" "Barking and Dagenham" "Barnet" "Bexley" ...
##  $ HECTARES         : num  315 3780 8675 6429 4323 ...
##  $ NONLD_AREA       : num  24.5 169.2 0 370.6 0 ...
##  $ ONS_INNER        : chr  "T" "F" "F" "F" ...
##  $ SUB_2009         : chr  NA NA NA NA ...
##  $ SUB_2006         : chr  NA NA NA NA ...
##  $ Borough          : chr  "City of London" "Barking and Dagenham" "Barnet" "Bexley" ...
##  $ Income_est       : int  59240 27360 38120 34440 30370 40150 41120 34730 34140 31340 ...
##  $ share_qualified  : num  NA 26.9 50.4 27.2 30.2 43.5 55.2 36.7 47.1 36.8 ...
##  $ share_unqualified: num  NA 13.9 6.1 8.6 11.2 7.5 8.5 8.9 11.1 8.7 ...
##  $ geometry         :sfc_MULTIPOLYGON of length 33; first list element: List of 1
##   ..$ :List of 1
##   .. ..$ : num [1:343, 1:2] 700427 700426 700425 700425 700426 ...
##   ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
##  - attr(*, "sf_column")= chr "geometry"
##  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "names")= chr [1:11] "GSS_CODE" "NAME" "HECTARES" "NONLD_AREA" ...

6.6 Plot spatial data

Create a map showing income. Use ggplot, tmap or mapview, or any package you want.

mapview(spatial_dat, zcol='Income_est')

We can see that lower income districts are located in the North-Eastern part of the city, while higher income districts are in the city centre and South-East. Therefore we may be dealing with spatial autocorrelation.

7 Spatial models

Now you have an option to try and improve the regression model that you have built previously. Does the model suffer from spatial autocorrelation? You may add additional code blocks to structure your code and make it cleaner.

Let us find the neighbours for each district.

neighbours <- spdep::poly2nb(spatial_dat)
neighbours
## Neighbour list object:
## Number of regions: 33 
## Number of nonzero links: 136 
## Percentage nonzero weights: 12.48852 
## Average number of links: 4.121212

Let us construct the matrix of neighbours by using the initial sf-object.

Queen neighborhood type

lnd_nb_queen <- poly2nb(spatial_dat, queen = TRUE)
plot.nb(lnd_nb_queen, st_geometry(spatial_dat), lwd = 0.3)

Rook neighborhood type

lnd_nb_rook <- poly2nb(spatial_dat, queen = FALSE)
plot.nb(lnd_nb_rook, st_geometry(spatial_dat), lwd = 0.3)

We may estimate the number of neighbours for each district via two ways and compare in which case the variation will be lower.

n_nbrs_queen <- sapply(lnd_nb_queen, length)
n_nbrs_rook <- sapply(lnd_nb_rook, length)
compare_nbrs <- data.table(queen = n_nbrs_queen, rook = n_nbrs_rook)
compare_nbrs_long <- melt(compare_nbrs, measure.vars = c("queen", "rook"), value.name = "n_neighbours", variable.name = "type")
compare_nbrs_long %>%
  ggplot() +
  geom_histogram(aes(x = n_neighbours, col = type), fill = NA, binwidth = 1, position = "identity") +
  facet_wrap(~type)+
  theme_pubclean()

The distribution of neighbours is identical regardless of neighborhood type.

weights <- nb2listw(lnd_nb_queen, style="W")
spatial_dat$lag_Income_est <- lag.listw(x = weights, spatial_dat$Income_est)
moran.mc(spatial_dat$Income_est, listw = weights, nsim = 999)
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  spatial_dat$Income_est 
## weights: weights  
## number of simulations + 1: 1000 
## 
## statistic = 0.24636, observed rank = 981, p-value = 0.019
## alternative hypothesis: greater

According to the p-value (which is lower than the significance level), there is spatial autocorrelation of income between the districts.

local_moran_Income_est <- localmoran(x = spatial_dat$Income_est, listw = weights)
hist(local_moran_Income_est[,5], breaks = 20)

signif <- 0.05
spatial_dat$Income_est_sig <- local_moran_Income_est[,5] # get the 5th column of the matrix - the `Pr(z > 0)` column
spatial_dat$Income_est_li <- local_moran_Income_est[,1] - mean(local_moran_Income_est[,1])

spatial_dat <- spatial_dat %>% mutate(quad_sig_Income_est = case_when(Income_est_sig > signif ~ "non-significant",
                                                      Income_est > 0 & lag_Income_est > 0 & Income_est_sig <= signif ~ "high-high",
                                              Income_est < 0 & lag_Income_est < 0 & Income_est_sig <= signif ~ "low-low",
                                              Income_est > 0 & lag_Income_est < 0 & Income_est_sig <= signif ~ "high-low",
                                              Income_est < 0 & lag_Income_est > 0 & Income_est_sig <= signif ~ "low-high"))

table(spatial_dat$quad_sig_Income_est)
## 
##       high-high non-significant 
##               4              29
head(spatial_dat)
## Simple feature collection with 6 features and 15 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 684700.2 ymin: 5686446 xmax: 723412.8 ymax: 5728067
## Projected CRS: WGS 84 / UTM zone 30N
##    GSS_CODE                 NAME  HECTARES NONLD_AREA ONS_INNER SUB_2009
## 1 E09000001       City of London   314.942     24.546         T     <NA>
## 2 E09000002 Barking and Dagenham  3779.934    169.150         F     <NA>
## 3 E09000003               Barnet  8674.837      0.000         F     <NA>
## 4 E09000004               Bexley  6428.649    370.619         F     <NA>
## 5 E09000005                Brent  4323.270      0.000         F     <NA>
## 6 E09000006              Bromley 15013.487      0.000         F     <NA>
##   SUB_2006              Borough Income_est share_qualified share_unqualified
## 1     <NA>       City of London      59240              NA                NA
## 2     <NA> Barking and Dagenham      27360            26.9              13.9
## 3     <NA>               Barnet      38120            50.4               6.1
## 4     <NA>               Bexley      34440            27.2               8.6
## 5     <NA>                Brent      30370            30.2              11.2
## 6     <NA>              Bromley      40150            43.5               7.5
##                         geometry lag_Income_est Income_est_sig Income_est_li
## 1 MULTIPOLYGON (((700427.3 57...       37496.00      0.6265667   -0.03006582
## 2 MULTIPOLYGON (((713156.7 57...       31733.33      0.1317454    0.86997600
## 3 MULTIPOLYGON (((693620.4 57...       34510.00      0.3842329   -0.30541421
## 4 MULTIPOLYGON (((716504.1 57...       36405.00      0.8798883   -0.21008148
## 5 MULTIPOLYGON (((694458.6 57...       41035.71      0.1009573   -0.82403334
## 6 MULTIPOLYGON (((709974.9 56...       34290.00      0.3006389   -0.43191213
##   quad_sig_Income_est
## 1     non-significant
## 2     non-significant
## 3     non-significant
## 4     non-significant
## 5     non-significant
## 6     non-significant

We can see that the significance is added to the last column of the dataset.

map_1 <- spatial_dat %>% dplyr::select(Income_est, Income_est_sig, quad_sig_Income_est)
mapview(map_1, zcol = "quad_sig_Income_est")

The only significantly different districts are the central district of London and some of the South-Eastern districts.

system.time( man_lagsarlm <- lagsarlm(log(Income_est) ~ share_qualified + share_unqualified, data = spatial_dat, listw = weights) )
##    user  system elapsed 
##    0.35    0.00    0.38
summary(man_lagsarlm)
## 
## Call:lagsarlm(formula = log(Income_est) ~ share_qualified + share_unqualified, 
##     data = spatial_dat, listw = weights)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.163949 -0.048236 -0.016882  0.048428  0.216464 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##                     Estimate Std. Error z value      Pr(>|z|)
## (Intercept)       10.5103469  1.8519817  5.6752 0.00000001385
## share_qualified    0.0056926  0.0015438  3.6875     0.0002265
## share_unqualified -0.0331996  0.0072318 -4.5908 0.00000441638
## 
## Rho: 0.0028315, LR test value: 0.00023381, p-value: 0.9878
## Asymptotic standard error: 0.17474
##     z-value: 0.016204, p-value: 0.98707
## Wald statistic: 0.00026257, p-value: 0.98707
## 
## Log likelihood: 33.62559 for lag model
## ML residual variance (sigma squared): 0.0071583, (sigma: 0.084607)
## Number of observations: 32 
## Number of parameters estimated: 5 
## AIC: -57.251, (AIC for lm: -59.251)
## LM test for residual autocorrelation
## test value: 1.8234, p-value: 0.17691
system.time( man_errorsarlm <- errorsarlm(log(Income_est) ~ share_qualified + share_unqualified, data = spatial_dat, listw = weights) )
##    user  system elapsed 
##    0.33    0.00    0.34
summary(man_errorsarlm)
## 
## Call:
## errorsarlm(formula = log(Income_est) ~ share_qualified + share_unqualified, 
##     data = spatial_dat, listw = weights)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.158568 -0.049496 -0.015760  0.059591  0.199064 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                     Estimate Std. Error z value              Pr(>|z|)
## (Intercept)       10.5377073  0.1098399 95.9369 < 0.00000000000000022
## share_qualified    0.0058559  0.0015474  3.7844             0.0001541
## share_unqualified -0.0334749  0.0068105 -4.9152          0.0000008868
## 
## Lambda: 0.24087, LR test value: 0.96926, p-value: 0.32486
## Asymptotic standard error: 0.21551
##     z-value: 1.1177, p-value: 0.26371
## Wald statistic: 1.2492, p-value: 0.26371
## 
## Log likelihood: 34.1101 for error model
## ML residual variance (sigma squared): 0.0068368, (sigma: 0.082685)
## Number of observations: 32 
## Number of parameters estimated: 5 
## AIC: -58.22, (AIC for lm: -59.251)

The spatial error autocorrelation coefficient (lambda) is 0.24087.

8 Write a function

Write a simple function with two parameters.

This function must take as an sf object as a first parameter. The second parameter is a character vector of length one with the name of the variable that you want to create a histogram for. The function log-transforms the variable specified via the 2nd parameter and creates a histogram with title “Log-transformed variable histogram”.

If you cannot write such function, write a function that finds a 5th root of the product of squares of 3 numbers - all passed as arguments into the function, and the result should be divided by some number passed as the fourth argument to that function. The second and third arguments default value is 1. The first argument has no default value. The output of the function is a printout like You have entered numbers 3, 7 and 8. The result rounded to 2 digits is: 7.76. In any case, try to build the function in its basic form. You may not, for example, be able to create a proper printout for the function, but the result of the calculation may be correct.

Define your function below:

# Plot a histogram of the natural log of one attribute column of an sf object.
#
# sf_obj : an sf object whose attribute table contains the column `var`
# var    : length-one character vector naming the column to log-transform
# Returns (visibly) the "histogram" object produced by hist().
fun_1 <- function(sf_obj,
                  var) {
  # Drop the geometry column so we work with a plain data.frame.
  attribute_table <- st_set_geometry(sf_obj, NULL)
  logged_values <- log(attribute_table[, var])
  plotted <- hist(logged_values,
                  main = "Log-transformed variable histogram",
                  breaks = 33)
  plotted
}

Test your function three times with different values for parameters in the code chunk below:

fun_1(spatial_dat, 'Income_est')

## $breaks
##  [1] 10.18 10.20 10.22 10.24 10.26 10.28 10.30 10.32 10.34 10.36 10.38 10.40
## [13] 10.42 10.44 10.46 10.48 10.50 10.52 10.54 10.56 10.58 10.60 10.62 10.64
## [25] 10.66 10.68 10.70 10.72 10.74 10.76 10.78 10.80 10.82 10.84 10.86 10.88
## [37] 10.90 10.92 10.94 10.96 10.98 11.00
## 
## $counts
##  [1] 1 1 0 0 0 0 0 2 1 0 3 3 1 6 0 1 1 2 1 1 0 1 3 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0
## [39] 0 0 1
## 
## $density
##  [1] 1.515152 1.515152 0.000000 0.000000 0.000000 0.000000 0.000000 3.030303
##  [9] 1.515152 0.000000 4.545455 4.545455 1.515152 9.090909 0.000000 1.515152
## [17] 1.515152 3.030303 1.515152 1.515152 0.000000 1.515152 4.545455 0.000000
## [25] 0.000000 1.515152 1.515152 0.000000 0.000000 0.000000 0.000000 1.515152
## [33] 0.000000 1.515152 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [41] 1.515152
## 
## $mids
##  [1] 10.19 10.21 10.23 10.25 10.27 10.29 10.31 10.33 10.35 10.37 10.39 10.41
## [13] 10.43 10.45 10.47 10.49 10.51 10.53 10.55 10.57 10.59 10.61 10.63 10.65
## [25] 10.67 10.69 10.71 10.73 10.75 10.77 10.79 10.81 10.83 10.85 10.87 10.89
## [37] 10.91 10.93 10.95 10.97 10.99
## 
## $xname
## [1] "var_vector"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
fun_1(spatial_dat, 'share_qualified')

## $breaks
##  [1] 2.90 2.95 3.00 3.05 3.10 3.15 3.20 3.25 3.30 3.35 3.40 3.45 3.50 3.55 3.60
## [16] 3.65 3.70 3.75 3.80 3.85 3.90 3.95 4.00 4.05 4.10 4.15 4.20
## 
## $counts
##  [1] 1 0 0 0 0 0 0 1 1 0 1 0 0 2 4 1 0 3 2 5 2 1 3 4 0 1
## 
## $density
##  [1] 0.625 0.000 0.000 0.000 0.000 0.000 0.000 0.625 0.625 0.000 0.625 0.000
## [13] 0.000 1.250 2.500 0.625 0.000 1.875 1.250 3.125 1.250 0.625 1.875 2.500
## [25] 0.000 0.625
## 
## $mids
##  [1] 2.925 2.975 3.025 3.075 3.125 3.175 3.225 3.275 3.325 3.375 3.425 3.475
## [13] 3.525 3.575 3.625 3.675 3.725 3.775 3.825 3.875 3.925 3.975 4.025 4.075
## [25] 4.125 4.175
## 
## $xname
## [1] "var_vector"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
fun_1(spatial_dat, 'share_unqualified')

## $breaks
##  [1] 1.40 1.45 1.50 1.55 1.60 1.65 1.70 1.75 1.80 1.85 1.90 1.95 2.00 2.05 2.10
## [16] 2.15 2.20 2.25 2.30 2.35 2.40 2.45 2.50 2.55 2.60 2.65
## 
## $counts
##  [1] 1 0 0 0 2 0 1 0 1 0 1 1 4 1 1 3 0 1 3 2 5 1 1 0 3
## 
## $density
##  [1] 0.625 0.000 0.000 0.000 1.250 0.000 0.625 0.000 0.625 0.000 0.625 0.625
## [13] 2.500 0.625 0.625 1.875 0.000 0.625 1.875 1.250 3.125 0.625 0.625 0.000
## [25] 1.875
## 
## $mids
##  [1] 1.425 1.475 1.525 1.575 1.625 1.675 1.725 1.775 1.825 1.875 1.925 1.975
## [13] 2.025 2.075 2.125 2.175 2.225 2.275 2.325 2.375 2.425 2.475 2.525 2.575
## [25] 2.625
## 
## $xname
## [1] "var_vector"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

8.1 Error checking

Rewrite your function in such a way that it checks if the name of the variable to log-transform and creates a histogram that you are passing to it actually exists in the dataset. If it does not exist, return and error message with print() instead of trying to plot the variable that does not exist.

# Plot a histogram of the natural log of one attribute column of an sf
# object, with validation of the column name.
#
# sf_obj : an sf object whose attribute table may contain the column `var`
# var    : length-one character vector naming the column to log-transform
# Returns the "histogram" object when `var` exists; otherwise prints (and
# returns) an error message instead of plotting, as the exercise requires.
fun_1 <- function(sf_obj,
                  var) {
  # Drop the geometry first, then validate: the original checked colnames of
  # the full sf object, so passing the geometry column's name slipped past
  # the guard and log() then failed on the geometry list-column.
  attribute_table <- st_set_geometry(sf_obj, NULL)
  # Guard clause: bail out early on an unknown column name.
  if (!(var %in% colnames(attribute_table))) {
    return(print('Error! This variable does not exist'))
  }
  logged_values <- log(attribute_table[, var])
  plotted <- hist(logged_values,
                  main = "Log-transformed variable histogram",
                  breaks = 33)
  plotted
}
fun_1(spatial_dat, 'ty')
## [1] "Error! This variable does not exist"